## HILDA GROUPED
## This script creates one consistent HILDA dataset that includes all HILDA data for all waves used in other model-related scripts, 
## including new variables.

## Constructed variables include:
## - Total inc, total wealth
## - Bequests, gifts
## - Age group, income group
## - Parent age group


# Prelims -----------------------------------------------------------------


rm(list=ls())
gc()


## hilda file paths
hilda_qs_path <- "./HILDA Wave 19/qs files/"

## get file names of combined dta files
combined_files <- list.files(hilda_qs_path) %>% 
  str_subset("Combined_.+") %>% 
  str_remove(".qs")

## file name of first estate bequest indicator
first_estate_path <-  paste0(hilda_qs_path, "first_estate_bequest_id.qs")

## file name of indiv wealth data 
indiv_wealth_path <- paste0(hilda_qs_path, "indiv_wealth_vars_restricted.qs")
## Indiv wealth data for waves 2, 6, 10, 14, 18 


## dataframe for matching year / wave ID / wave letter in HILDA
waves <- tibble(letter = letters[1:length(combined_files)], 
                year = c(2001:(2000+length(combined_files)))
) %>% 
  rowid_to_column(., "id")



# Read relevant HILDA variables and create longitudinal dataset -------------------------------------------


hilda_vars <- c(
  "xwaveid",                      # cross wave ID
  "hhrhid",                       # Wave randomised _hhrhid household ID
  "wavenumber",                   # generated wave number
  "hhwtrp$", "hhwte$",             # responding person weight (15+) and enumerated person weight
  "hhwth", "hhpno",                # household weight, and hh person number (could use as hh reference person)
  "hgage$", "hgyob",               # age last birthday at June 30 and year of birth
  
  ## INCOME VARIABLES
  "tifdip", "tifdin",             # FY disposable regular income 
  "tifefp", "tifefn",             # FY gross regular income
  "wsfes",                       # FY wages and salary
  "oifsupi",                       # FY regular super and annuities income
  "oiinti",                        # FY interest
  "oidvryi",                       # FY dividends and royalties
  "txtotp", "txtotn",             # FY income taxes and medicare levy
  
  ## BEQUESTS
  "oifinh",                       # Received bequest? 1 = yes
  "oifinha",                      # FY inheritance/bequests
  
  ## INTERGEN GIFTS
  "oifpnt",                       ## Received transfer from parent? 1 = Yes
  "oifpnta",                       # FY transfers from parents ($)
  
  ## WEALTH VARIABLES - to be read in separately from Lawson's analysis file
  
  ## FAMILY LINKAGE VARIABLES
  "hhfxid", "hhmxid",            # father / mother x wave ID (if co-resident, and includes non-biological)
  "psb$", "psflive", "psmlive",    # if both parents / father / mother alive
  "psfage", "psmage",             # father / mother age 
  "psyobf", "psyobm",             # father / mother year of birth 
  "psyodf", "psyodm",             # father / mother year of death 
  
  ## CHILDREN VARIABLES
  "tcr$",                         # Number of own resident children
  "tcnr$",                         # Number of own non-resident children
  "fmnsib",                         # Number of siblings
  "fmhsib",                         # ever had siblings
  
  ## HOUSING VARIABLES
  "hsyr",                            # year purchased home (available in wealth years)
  "hsprice",                         # price of home when purchased
  "hwhmvai",                          ## HH wealth, home apportioned value
  "hsvalui",                         ## annual - DV: Home value ($) [imputed]
  "hsdebti",                         ## annual - DV: Total Home Debt ($) [imputed]
  "hspown",                           ## waves 1&2 - Any owners owned a home previously
  "hsmgi",                            # Mortgage usual repayments $ per month [imputed]
  
  ## RELATIONSHIP STATUS
  "mrcurr",                          ## DV: Marital status from person questionnaire (1/2 = legally married/de facto)
  
  ## HOUSEHOLD VARS for misc calcs
  "hifefp", "hifefn",                ## hh gross regular income
  "hwtpvi"                           ## household total property wealth
  
)

hilda_data_orig <- lapply(c(1:19),
                          function(n) {
                            ## read in qs data
                            qread(paste0(hilda_qs_path, combined_files[n], ".qs")) %>% 
                              ## select variables to keep (will still run if a wave doesn't contain all listed vars)
                              select(matches(hilda_vars))
                          }
) %>% 
  ## bind all waves together, filling missing columns with NA
  rbindlist(fill=T) %>% 
  ## initial cleaning
  ## if variable is an empty character, replace with NA
  mutate_if(is.character, ~ifelse(. =="", NA_character_, .)) %>% 
  ## convert xwaveid to numeric to allow merging w indiv wealth
  mutate(across(c(xwaveid, hhfxid, hhmxid), as.numeric))



# Attach additional HILDA-related data  -------------------------------------

## read in individual wealth data 
indiv_wealth <- qread(indiv_wealth_path) 

## read in first estate bequest id
first_estate_bequest_id <- qread(first_estate_path) ## for responding persons

## attach to hilda_data_orig
hilda_data_w_wealth <- hilda_data_orig %>% 
  left_join(indiv_wealth, by=c("xwaveid", "wavenumber"="wave")) %>% 
  left_join(first_estate_bequest_id, by=c("xwaveid", "wavenumber"="wave")) %>% 
  mutate(first_estate_bequest = ifelse( is.na(first_estate_bequest), 0, first_estate_bequest) )




# Create new variables ----------------------------------------------------

## number of income quantiles
inc_grps <- 5


hilda_grouped0 <- hilda_data_w_wealth %>% 
  ## create some combined vars for inc and wealth
  mutate(
    ## create combined income variable (pos and neg vals)
    total_inc_combined = case_when(
      tifefp>0 & !is.na(tifefp) ~ tifefp,
      tifefn>0 & !is.na(tifefn) ~ 0 - tifefn,
      tifefp==0 & tifefn==0 ~ 0
    ) ,
    ## remove regular super, interest and dividends from total for consistency with model use and avoid double counting - these income sources incorporated elsewhere in model
    total_inc = total_inc_combined - oifsupi - oiinti - oidvryi,
    
    ## wealth and inc convenience variables
    wages_inc = wsfes,
    
    total_assets = pwassei,
    housing_assets = pwtpvi,
    super_assets = pwsupei,
    total_debt = pwdebti,
    housing_debt = pwtpdi,
    
    ## create "other inc" and "other wealth" variables
    other_inc = total_inc - wages_inc, ## all non wage income
    other_assets = total_assets - housing_assets - super_assets, ## all non housing non super assets
    other_debt = total_debt - housing_debt
  ) %>% 
  
  
  # custom bequest variable including 0 if asked the question. and replace with 0 if it's a likely first estate
  mutate(bequests = case_when(
    oifinh==0 ~ 0,
    oifinha>=0 & first_estate_bequest!=1 ~ as.numeric(oifinha), ## negative values mean missing data - will be treated as NA
    oifinha>=0 & first_estate_bequest==1 ~ 0
  ) ,
  # custom parental gifts variable
  gifts = case_when(
    oifpnt==0 ~ 0,
    oifpnta>=0 ~ as.numeric(oifpnta) ## negative values mean missing data - will be treated as NA
  )) %>% 
  
  ## create cohort variables - age group, income group, wealth group
  mutate(
    ## age group - every 5 years
    age_grp = cut(hgage, breaks=seq(0, 105, 5), right=F, include.lowest=T, ordered_result=T),
    age_grp2 = cut(hgage, c(0, breaks=seq(15, 85, 5), 105), right=F, include.lowest=T, ordered_result=T)
  ) %>% 
  
  ## break the dataset up into waves to calculate the weighted quantiles of inc and wealth by wave
  split(., .$wavenumber) %>% 
  
  ## for each wave, create the following weighted quantile variables
  ## Note: the wtd.quantile function takes a while to run
  lapply(., 
         function(x) {
           ## create a total inc quantile variable, weighted to population
           wave_w_qtiles <- x %>% 
             mutate(    ## weighted quantile of gross income
               total_inc_qtile = cut(total_inc, 
                                     ## cut points determined by weighted quantile (only works if cut points are unique)
                                     breaks = Hmisc::wtd.quantile(.$total_inc, 
                                                                  weights = .$hhwte, 
                                                                  probs = seq(0, 1, 1/inc_grps)), 
                                     include.lowest=T,
                                     labels = c(1:inc_grps),
                                     ordered_result=T))
           
           wave_w_qtiles
           
         }) %>% 
  ## put all waves back together into same dataframe
  rbindlist(fill=T)


## df of wavenumber, xwaveid and age only - for help determining parent age below
hilda_id_ages <- hilda_data_orig %>% 
  select(wavenumber, xwaveid, hgage)


## new parent age variables
hilda_grouped1 <- hilda_grouped0 %>% 
  ## merge in data on parent ages where resident in household
  ## father age:
  left_join(hilda_id_ages %>% rename(father_age_merged=hgage),
            by = c("wavenumber", "hhfxid"="xwaveid")) %>% 
  ## mother age:
  left_join(hilda_id_ages %>% rename(mother_age_merged=hgage),
            by = c("wavenumber", "hhmxid"="xwaveid")) %>% 
  ## fill in all parent age-related variables in waves asked
  mutate(mother_yob = psyobm, father_yob = psyobf, 
         mother_yod = psyodm, father_yod = psyodf,
         parent_alive = psb) %>% 
  group_by(xwaveid) %>% 
  arrange(xwaveid, wavenumber) %>% 
  ## fill up and then down missing values by person id - takes a while to run
  mutate(across(c(mother_yob, father_yob, mother_yod, father_yod, parent_alive), 
                ~ nafill(.x, type="nocb") %>% nafill(type="locf"))) %>% 
  ungroup

hilda_grouped <- hilda_grouped1 %>% 
  ## calculate parent age if parent still alive in that year (only available for responding persons)
  ## note yod -1 = not asked, -2 = not applicable
  ## if yod or yob not available for the year, can use mother/father age from merged data
  mutate(mother_age = case_when( 
    (mother_yod>0 & wavenumber+2000<=mother_yod & mother_yob>0) | (mother_yod %in% c(-1,-2) & mother_yob>0) ~ wavenumber+2000-mother_yob,
    TRUE ~ as.numeric(mother_age_merged)
  ),
  father_age = case_when( 
    (father_yod>0 & wavenumber+2000<=father_yod & father_yob>0) | (father_yod %in% c(-1,-2) & father_yob>0) ~ wavenumber+2000-father_yob,
    TRUE ~ as.numeric(father_age_merged)
  )
  ) %>% 
  ## throw out infeasible ages
  mutate(mother_age = ifelse(mother_age>110, NA, mother_age),
         father_age = ifelse(father_age>110, NA, father_age)) %>% 
  ## lowest age of living parents - for determining gift giver/receipt and bequest distributions later
  mutate(parent_age = case_when(
    !is.na(mother_age) & !is.na(father_age)  ~ pmin(mother_age,father_age), ## if both alive, take the youngest age
    !is.na(mother_age)  ~ mother_age, ## if only mother alive, use age of mother
    !is.na(father_age)  ~ father_age ## if only father alive, use age of father
  )
  ) %>% 
  ## if any >105, cap at 105 to put them in highest age group
  mutate(parent_age = ifelse(parent_age>105, 105, parent_age)) %>% 
  ## create grouping variable for parent age
  mutate(parent_age_grp = cut(parent_age, breaks=seq(0, 105, 5), right=F, include.lowest=T, ordered_result=T)) %>% 
  arrange(wavenumber, xwaveid)


qsave(hilda_grouped, "./Input data/Intermediate input data/hilda_grouped_master.qs")